R Markdown

From the presivous analysis result, we try to find the pure relation between colorfullness/complexity and rating without considering user’s gender, age, education’ effects towards to model. Thus, we will regroup the dataset and then do the fitted linear regression model. Except, that, we will choose some web imgs as compare test after the model is finished. The imgs are eng(346, 347, 348, 349), webby(18, 19). grayscale(18, 19)…

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
train_dat <-  read.table("/Volumes/STF/Final_siemens/Research_Qi/Test4/train_list_color_complx.txt", quote="\"")
test_dat <- read.table("/Volumes/STF/Final_siemens/Research_Qi/Test4/test_list_color_complx.txt", quote="\"")

web <- c(train_dat$V1)
rate <- c(train_dat$V2)
color <- c(train_dat$V3)
compl <- c(train_dat$V4)
dat_tarin <- data.frame(rate, color, compl)
# analysis rate itself
par(mfrow=c(1, 2))
qqnorm(rate)
qqline(rate, col = 2)

hist(rate)

hist((compl)^0.4)

hist(color)

hist(color^0.4)

#qqnorm(rate, color)
par(mfrow=c(1, 2))

qqplot(color, rate, xlab = "colorfullness", ylab = "mean rate value")
qqplot(compl, rate, xlab = "complextymodel", ylab = "mean rate value")

# Individual Pearson correlation (rate, colorfulness, complexity)
df <- data.frame(rate, color, compl)
plot(df[,1:3])

cor(df[, 1:3], method = "pearson")
##             rate     color      compl
## rate   1.0000000 0.1981562 -0.1795888
## color  0.1981562 1.0000000  0.4352672
## compl -0.1795888 0.4352672  1.0000000

Normal Linear Regression (colorfulness + complexity)

lm1 <- lm(rate ~ color + compl)
summary(lm1)
## 
## Call:
## lm(formula = rate ~ color + compl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.83377 -0.50135  0.04571  0.60210  2.36165 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.46151    0.24011  18.581  < 2e-16 ***
## color        0.23815    0.03962   6.011 4.83e-09 ***
## compl       -0.28257    0.04887  -5.782 1.69e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9639 on 335 degrees of freedom
## Multiple R-squared:  0.1265, Adjusted R-squared:  0.1212 
## F-statistic: 24.25 on 2 and 335 DF,  p-value: 1.463e-10
anova(lm1)
## Analysis of Variance Table
## 
## Response: rate
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## color       1  13.991 13.9910  15.058 0.0001255 ***
## compl       1  31.067 31.0669  33.437 1.688e-08 ***
## Residuals 335 311.257  0.9291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")

cor.test(resid.nlm+rate, rate)
## 
##  Pearson's product-moment correlation
## 
## data:  resid.nlm + rate and rate
## t = 103.3291, df = 336, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9809903 0.9875723
## sample estimates:
##      cor 
## 0.984627
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5

mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]

par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)

mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)

predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
## 
##  Pearson's product-moment correlation
## 
## data:  mytest$rate and predi_value
## t = 4.3264, df = 96, p-value = 3.713e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2234223 0.5576606
## sample estimates:
##      cor 
## 0.403936

Linear + Interaction

lm1 <- lm(rate ~  color +  compl + color * compl)
summary(lm1)
## 
## Call:
## lm(formula = rate ~ color + compl + color * compl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.43933 -0.40221  0.07234  0.55138  1.72819 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.98304    0.89330   1.100   0.2739    
## color        1.11924    0.19267   5.809 8.51e-08 ***
## compl        0.46013    0.18991   2.423   0.0173 *  
## color:compl -0.17826    0.03672  -4.855 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7964 on 94 degrees of freedom
## Multiple R-squared:  0.3407, Adjusted R-squared:  0.3197 
## F-statistic: 16.19 on 3 and 94 DF,  p-value: 1.454e-08
anova(lm1)
## Analysis of Variance Table
## 
## Response: rate
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## color        1  0.953  0.9533   1.503    0.2233    
## compl        1 14.909 14.9091  23.506 4.913e-06 ***
## color:compl  1 14.952 14.9518  23.573 4.778e-06 ***
## Residuals   94 59.622  0.6343                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")

cor.test(resid.nlm+rate, rate)
## 
##  Pearson's product-moment correlation
## 
## data:  resid.nlm + rate and rate
## t = 34.3018, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9430459 0.9741126
## sample estimates:
##       cor 
## 0.9615429
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5

mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]

par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)

mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)

predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
## 
##  Pearson's product-moment correlation
## 
## data:  mytest$rate and predi_value
## t = 7.0438, df = 96, p-value = 2.821e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4357702 0.7009541
## sample estimates:
##       cor 
## 0.5837192

Linear + Interaction color reduce

lm1 <- lm(rate ~  compl + color * compl)
summary(lm1)
## 
## Call:
## lm(formula = rate ~ compl + color * compl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.43933 -0.40221  0.07234  0.55138  1.72819 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.98304    0.89330   1.100   0.2739    
## compl        0.46013    0.18991   2.423   0.0173 *  
## color        1.11924    0.19267   5.809 8.51e-08 ***
## compl:color -0.17826    0.03672  -4.855 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7964 on 94 degrees of freedom
## Multiple R-squared:  0.3407, Adjusted R-squared:  0.3197 
## F-statistic: 16.19 on 3 and 94 DF,  p-value: 1.454e-08
anova(lm1)
## Analysis of Variance Table
## 
## Response: rate
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## compl        1  7.265  7.2648  11.454 0.0010422 ** 
## color        1  8.598  8.5977  13.555 0.0003866 ***
## compl:color  1 14.952 14.9518  23.573 4.778e-06 ***
## Residuals   94 59.622  0.6343                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")

cor.test(resid.nlm+rate, rate)
## 
##  Pearson's product-moment correlation
## 
## data:  resid.nlm + rate and rate
## t = 34.3018, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9430459 0.9741126
## sample estimates:
##       cor 
## 0.9615429
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5

mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]

par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)

mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)

predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
## 
##  Pearson's product-moment correlation
## 
## data:  mytest$rate and predi_value
## t = 7.0438, df = 96, p-value = 2.821e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4357702 0.7009541
## sample estimates:
##       cor 
## 0.5837192

Original 5 factor + interaction 0.62

lm1 <- lm(rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) + I(abs(color^3 - compl^3)) + color * compl)

summary(lm1)
## 
## Call:
## lm(formula = rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) + 
##     I(abs(color^3 - compl^3)) + color * compl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49350 -0.38984  0.09987  0.48085  1.75916 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.075801   5.873379   0.524   0.6018    
## color                      1.494803   0.355571   4.204 6.12e-05 ***
## compl                      1.451452   1.620673   0.896   0.3728    
## I(color^(exp(1)))          0.003567   0.010599   0.337   0.7372    
## I(compl^(0.4))            -3.590708   6.799542  -0.528   0.5987    
## I(abs(color^3 - compl^3)) -0.003312   0.002630  -1.259   0.2111    
## color:compl               -0.263490   0.126954  -2.075   0.0408 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.782 on 91 degrees of freedom
## Multiple R-squared:  0.3846, Adjusted R-squared:  0.3441 
## F-statistic:  9.48 on 6 and 91 DF,  p-value: 4.451e-08
anova(lm1)
## Analysis of Variance Table
## 
## Response: rate
##                           Df Sum Sq Mean Sq F value    Pr(>F)    
## color                      1  0.953  0.9533  1.5589   0.21503    
## compl                      1 14.909 14.9091 24.3793 3.568e-06 ***
## I(color^(exp(1)))          1 12.425 12.4252 20.3175 1.947e-05 ***
## I(compl^(0.4))             1  1.986  1.9856  3.2468   0.07488 .  
## I(abs(color^3 - compl^3))  1  1.878  1.8780  3.0709   0.08307 .  
## color:compl                1  2.634  2.6343  4.3076   0.04076 *  
## Residuals                 91 55.651  0.6116                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")

cor.test(resid.nlm+rate, rate)
## 
##  Pearson's product-moment correlation
## 
## data:  resid.nlm + rate and rate
## t = 32.5322, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9371434 0.9713823
## sample estimates:
##       cor 
## 0.9575154
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5

mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]

par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)

mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)

predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
## 
##  Pearson's product-moment correlation
## 
## data:  mytest$rate and predi_value
## t = 7.7464, df = 96, p-value = 9.739e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4809596 0.7289148
## sample estimates:
##       cor 
## 0.6201938

According to QQplot result, the colorfulness and complexity should fit the mean rate value by normal distribution approximately. Thus, We will try two different regression model to see the result: normal linear regression model and fitted linear regression model

n.lm <- lm(rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) + I(abs(color^3 - compl^3)))
summary(n.lm)
## 
## Call:
## lm(formula = rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) + 
##     I(abs(color^3 - compl^3)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.36206 -0.42498  0.08857  0.53424  1.72676 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -5.303775   4.341664  -1.222 0.224980    
## color                      0.975095   0.256943   3.795 0.000264 ***
## compl                     -1.632653   0.658419  -2.480 0.014971 *  
## I(color^(exp(1)))         -0.016101   0.004831  -3.333 0.001240 ** 
## I(compl^(0.4))             7.532503   4.259201   1.769 0.080288 .  
## I(abs(color^3 - compl^3))  0.001742   0.001012   1.722 0.088481 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.796 on 92 degrees of freedom
## Multiple R-squared:  0.3555, Adjusted R-squared:  0.3205 
## F-statistic: 10.15 on 5 and 92 DF,  p-value: 9.183e-08
anova(n.lm)
## Analysis of Variance Table
## 
## Response: rate
##                           Df Sum Sq Mean Sq F value    Pr(>F)    
## color                      1  0.953  0.9533  1.5048   0.22306    
## compl                      1 14.909 14.9091 23.5332 4.984e-06 ***
## I(color^(exp(1)))          1 12.425 12.4252 19.6124 2.609e-05 ***
## I(compl^(0.4))             1  1.986  1.9856  3.1341   0.07998 .  
## I(abs(color^3 - compl^3))  1  1.878  1.8780  2.9643   0.08848 .  
## Residuals                 92 58.285  0.6335                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2))                 
plot(n.lm)

# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(n.lm)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")

cor.test(resid.nlm+rate, rate)
## 
##  Pearson's product-moment correlation
## 
## data:  resid.nlm + rate and rate
## t = 33.6614, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9410073 0.9731707
## sample estimates:
##       cor 
## 0.9601528

After finish the model, we then use test_list_color_complx.txt to test the model

rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5

# color2 = color
# 
# for (i in 1:length(color)) {
#      if(color[i] < 3.5) {
#       color2[i] = color[i] * 0.8
#     } else if (color[i] > 7) {
#       color2[i] = color[i] * 1.8
#     } else {
#       color2[i] = color[i]
#     }
# }
# 

mytest <- data.frame(rate, color, compl)

estimate <- predict(n.lm , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]

par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)

mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(n.lm , newdata = mytest)

predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
## 
##  Pearson's product-moment correlation
## 
## data:  mytest$rate and predi_value
## t = 7.277, df = 96, p-value = 9.308e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4512086 0.7105991
## sample estimates:
##       cor 
## 0.5962478
final_data <- data.frame(test_dat, predi_value)

After the linear model is done, we will compare the test result with CNN test result and real rate using Person table.

compare_table <- data.frame(mytest$rate, test_dat$V5, predi_value)
hist(mytest$rate)

plot(compare_table[,1:3])

cor(compare_table[, 1:3], method = "pearson")
##             mytest.rate test_dat.V5 predi_value
## mytest.rate   1.0000000   0.8492508   0.5962478
## test_dat.V5   0.8492508   1.0000000   0.5353826
## predi_value   0.5962478   0.5353826   1.0000000
# CNN predict
plot(test_dat$V5, mytest$rate, xlim=c(2, 7), ylim=c(2, 7), ylab = "Real Rate", xlab = "CNN predict rate")

p1 <- ggplot(test_dat, aes(x = test_dat$V2, y = test_dat$V5)) + geom_point(colour = "#48b9f3", size = 3, alpha = 1)
 
p1 + geom_smooth(method = lm, se = FALSE) + labs(x = "User Aesthetics Rating", y = "CNN Predict Rating") + xlim(2.2, 6.5) + ylim(2.2, 6.5) + theme_bw()
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

# linear regression
plot(predi_value, mytest$rate, xlim=c(2, 7), ylim=c(2, 7), ylab = "Real Rate", xlab = "regression predict rate")

new_dat <- data.frame(test_dat, predi_value)
p2 <- ggplot(new_dat, aes(x = new_dat$V2, y = predi_value)) + geom_point(colour = "#48b9f3", size = 3, alpha = 1)
p2 + geom_smooth(method = lm, se = FALSE) + labs(x = "User Aesthetics Rating", y = "Linear Predict Rating") + xlim(2.2, 6.5) + ylim(2.2, 6.5) + theme_bw()
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# linear regression vs cnn
plot(predi_value, test_dat$V5 , xlim=c(2, 7), ylim=c(2, 7), ylab = "CNN predict rate", xlab = "regression predict 
rate")

true_rate = new_dat$V2
cnn_rate = new_dat$V5
linear_rate = predi_value



dat <- data.frame(xx =c(true_rate, cnn_rate, linear_rate), yy = rep(letters[1:3], each = length(cnn_rate)))


## Histogram user aesthetics rating
p4 <- ggplot(dat, aes(x = xx)) +
  geom_histogram(data = subset(dat, yy == 'a'), aes(y = ..density..), binwidth=.6, colour="black", fill = "white", alpha = 0.5) + 
  geom_density(data = subset(dat, yy == 'a'), fill = "#f7746c", colour = "#f7746c", alpha = 0.2) + xlim(1, 8)
p4 + theme_bw() 

## Histogram cnn predict rating
p6 <- ggplot(dat, aes(x = xx)) +
  geom_histogram(data = subset(dat, yy == 'b'), aes(y = ..density..), binwidth =.8, colour="black", fill = "white", alpha = 0.5) + 
  geom_density(data = subset(dat, yy == 'b'), colour = "#05bdc2", fill = "#05bdc2", alpha = 0.2) + xlim(1, 8)
  
p6 + theme_bw() 

p7 <- ggplot(dat, aes(x = xx)) +
   geom_histogram(data = subset(dat, yy == 'a'), aes(y = ..density..), fill = "#f7746c", alpha = 0.5) +
  geom_histogram(data = subset(dat, yy == 'b'), aes(y = ..density..), fill = "#05bdc2", alpha = 1) 
  
  p7 + theme_bw() 
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# p4 + stat_function(fun = dnorm, args = list(mean = mean(cnn_rate), sd = sd(cnn_rate)), lwd = 2, color = "red")


p5 <- ggplot(dat, aes(x = xx)) +geom_density(data = subset(dat, yy == 'a'), fill = NA, colour = "#f7746c", alpha = 1) + geom_density(data = subset(dat, yy == 'b'), fill = NA, colour = "#05bdc2", alpha = 1) + xlim(1,8) + labs(x = "rate")

p5 + theme_bw()

n = c(true_rate,cnn_rate)
ggplot(new_dat, aes(new_dat$V2), colour = "red") + geom_density(fill = "red", alpha = 0.2) + xlim(1, 8)

ggplot(new_dat, aes(new_dat$V4)) + geom_density(alpha = 0.1) + xlim(1, 8)
## Warning: Removed 1 rows containing non-finite values (stat_density).